import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import calmap
from pandas_profiling import ProfileReport
Link to data source: https://www.kaggle.com/aungpyaeap/supermarket-sales
Context
The growth of supermarkets in most populated cities are increasing and market competitions are also high. The dataset is one of the historical sales of supermarket company which has recorded in 3 different branches for 3 months data.
Data Dictionary
Invoice id: Computer generated sales slip invoice identification number
Branch: Branch of supercenter (3 branches are available identified by A, B and C).
City: Location of supercenters
Customer type: Type of customers, recorded by Members for customers using member card and Normal for without member card.
Gender: Gender type of customer
Product line: General item categorization groups - Electronic accessories, Fashion accessories, Food and beverages, Health and beauty, Home and lifestyle, Sports and travel
Unit price: Price of each product in USD
Quantity: Number of products purchased by customer
Tax: 5% tax fee for customer buying
Total: Total price including tax
Date: Date of purchase (Record available from January 2019 to March 2019)
Time: Purchase time (10am to 9pm)
Payment: Payment used by customer for purchase (3 methods are available – Cash, Credit card and Ewallet)
COGS: Cost of goods sold
Gross margin percentage: Gross margin percentage
Gross income: Gross income
Rating: Customer stratification rating on their overall shopping experience (On a scale of 1 to 10)
df = pd.read_csv('supermarket_sales.csv')
df.tail(10)
| Invoice ID | Branch | City | Customer type | Gender | Product line | Unit price | Quantity | Tax 5% | Total | Date | Time | Payment | cogs | gross margin percentage | gross income | Rating | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 993 | 690-01-6631 | B | Mandalay | Normal | Male | Fashion accessories | NaN | 10.0 | 8.7450 | 183.6450 | 2/22/19 | 18:35 | Ewallet | 174.90 | 4.761905 | 8.7450 | 6.6 |
| 994 | 652-49-6720 | C | Naypyitaw | Member | Female | Electronic accessories | NaN | 1.0 | 3.0475 | 63.9975 | 2/18/19 | 11:40 | Ewallet | 60.95 | 4.761905 | 3.0475 | 5.9 |
| 995 | 233-67-5758 | C | Naypyitaw | Normal | Male | Health and beauty | NaN | 1.0 | 2.0175 | 42.3675 | 1/29/19 | 13:46 | Ewallet | 40.35 | 4.761905 | 2.0175 | 6.2 |
| 996 | 303-96-2227 | B | Mandalay | Normal | Female | Home and lifestyle | NaN | 10.0 | 48.6900 | 1022.4900 | 3/2/19 | 17:16 | Ewallet | 973.80 | 4.761905 | 48.6900 | 4.4 |
| 997 | 727-02-1313 | A | Yangon | Member | Male | Food and beverages | NaN | 1.0 | 1.5920 | 33.4320 | 2/9/19 | 13:22 | Cash | 31.84 | 4.761905 | 1.5920 | 7.7 |
| 998 | 347-56-2442 | A | Yangon | Normal | Male | Home and lifestyle | 65.82 | 1.0 | 3.2910 | 69.1110 | 2/22/19 | 15:33 | Cash | 65.82 | 4.761905 | 3.2910 | 4.1 |
| 999 | 849-09-3807 | A | Yangon | Member | Female | Fashion accessories | 88.34 | 7.0 | 30.9190 | 649.2990 | 2/18/19 | 13:28 | Cash | 618.38 | 4.761905 | 30.9190 | 6.6 |
| 1000 | 849-09-3807 | A | Yangon | Member | Female | Fashion accessories | 88.34 | 7.0 | 30.9190 | 649.2990 | 2/18/19 | 13:28 | Cash | 618.38 | 4.761905 | 30.9190 | 6.6 |
| 1001 | 745-74-0715 | A | Yangon | Normal | Male | Electronic accessories | NaN | 2.0 | 5.8030 | 121.8630 | 3/10/19 | 20:46 | Ewallet | 116.06 | 4.761905 | 5.8030 | 8.8 |
| 1002 | 452-04-8808 | B | Mandalay | Normal | Male | Electronic accessories | 87.08 | NaN | 30.4780 | 640.0380 | 1/26/19 | 15:17 | Cash | 609.56 | 4.761905 | 30.4780 | 5.5 |
df.columns
Index(['Invoice ID', 'Branch', 'City', 'Customer type', 'Gender',
'Product line', 'Unit price', 'Quantity', 'Tax 5%', 'Total', 'Date',
'Time', 'Payment', 'cogs', 'gross margin percentage', 'gross income',
'Rating'],
dtype='object')
df.dtypes
Invoice ID object Branch object City object Customer type object Gender object Product line object Unit price float64 Quantity float64 Tax 5% float64 Total float64 Date object Time object Payment object cogs float64 gross margin percentage float64 gross income float64 Rating float64 dtype: object
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date' ,inplace=True)
df.describe()
| Unit price | Quantity | Tax 5% | Total | cogs | gross margin percentage | gross income | Rating | |
|---|---|---|---|---|---|---|---|---|
| count | 996.000000 | 983.000000 | 1003.000000 | 1003.000000 | 1003.000000 | 1003.000000 | 1003.000000 | 1003.000000 |
| mean | 55.764568 | 5.501526 | 15.400368 | 323.407726 | 308.007358 | 4.761905 | 15.400368 | 6.972682 |
| std | 26.510165 | 2.924673 | 11.715192 | 246.019028 | 234.303836 | 0.000000 | 11.715192 | 1.717647 |
| min | 10.080000 | 1.000000 | 0.508500 | 10.678500 | 10.170000 | 4.761905 | 0.508500 | 4.000000 |
| 25% | 33.125000 | 3.000000 | 5.894750 | 123.789750 | 117.895000 | 4.761905 | 5.894750 | 5.500000 |
| 50% | 55.420000 | 5.000000 | 12.096000 | 254.016000 | 241.920000 | 4.761905 | 12.096000 | 7.000000 |
| 75% | 78.085000 | 8.000000 | 22.539500 | 473.329500 | 450.790000 | 4.761905 | 22.539500 | 8.500000 |
| max | 99.960000 | 10.000000 | 49.650000 | 1042.650000 | 993.000000 | 4.761905 | 49.650000 | 10.000000 |
Question 1: What does the distribution of customer ratings looks like? Is it skewed?
sns.distplot(df["Rating"])
plt.axvline(x=np.mean(df['Rating']),c='red',ls='--',label='mean')
plt.axvline(x=np.percentile(df['Rating'],25),c='green' ,ls='--',label='25-75th percentile')
plt.axvline(x=np.percentile(df['Rating'],75),c='green' ,ls='--')
plt.legend()
<matplotlib.legend.Legend at 0x7f4c1751b0f0>
Question 2: Do aggregate sales numbers differ by much between branches?
sns.countplot(df['Branch'])
<AxesSubplot:xlabel='Branch', ylabel='count'>
df['Branch'].value_counts()
A 342 B 333 C 328 Name: Branch, dtype: int64
sns.countplot(df['Payment'])
<AxesSubplot:xlabel='Payment', ylabel='count'>
Question 3: Is there a relationship between gross income and customer ratings?
sns.regplot(df['Rating'],df['gross income'])
<AxesSubplot:xlabel='Rating', ylabel='gross income'>
sns.boxplot(x=df['Branch'],y=df['gross income'])
<AxesSubplot:xlabel='Branch', ylabel='gross income'>
sns.boxplot(x=df['Gender'],y=df['gross income'])
<AxesSubplot:xlabel='Gender', ylabel='gross income'>
Question 4: Is there a noticeable time trend in gross income?
df.head
<bound method NDFrame.head of Invoice ID Branch City Customer type Gender \
Date
2019-01-05 750-67-8428 A Yangon Member Female
2019-03-08 226-31-3081 C Naypyitaw Normal Female
2019-03-03 631-41-3108 A Yangon Normal Male
2019-01-27 123-19-1176 A Yangon Member Male
2019-02-08 373-73-7910 A Yangon Normal Male
... ... ... ... ... ...
2019-02-22 347-56-2442 A Yangon Normal Male
2019-02-18 849-09-3807 A Yangon Member Female
2019-02-18 849-09-3807 A Yangon Member Female
2019-03-10 745-74-0715 A Yangon Normal Male
2019-01-26 452-04-8808 B Mandalay Normal Male
Product line Unit price Quantity Tax 5% Total \
Date
2019-01-05 Health and beauty 74.69 7.0 26.1415 548.9715
2019-03-08 Electronic accessories 15.28 5.0 3.8200 80.2200
2019-03-03 Home and lifestyle 46.33 7.0 16.2155 340.5255
2019-01-27 Health and beauty 58.22 8.0 23.2880 489.0480
2019-02-08 Sports and travel 86.31 7.0 30.2085 634.3785
... ... ... ... ... ...
2019-02-22 Home and lifestyle 65.82 1.0 3.2910 69.1110
2019-02-18 Fashion accessories 88.34 7.0 30.9190 649.2990
2019-02-18 Fashion accessories 88.34 7.0 30.9190 649.2990
2019-03-10 Electronic accessories NaN 2.0 5.8030 121.8630
2019-01-26 Electronic accessories 87.08 NaN 30.4780 640.0380
Time Payment cogs gross margin percentage gross income \
Date
2019-01-05 13:08 Ewallet 522.83 4.761905 26.1415
2019-03-08 10:29 Cash 76.40 4.761905 3.8200
2019-03-03 13:23 Credit card 324.31 4.761905 16.2155
2019-01-27 20:33 Ewallet 465.76 4.761905 23.2880
2019-02-08 10:37 Ewallet 604.17 4.761905 30.2085
... ... ... ... ... ...
2019-02-22 15:33 Cash 65.82 4.761905 3.2910
2019-02-18 13:28 Cash 618.38 4.761905 30.9190
2019-02-18 13:28 Cash 618.38 4.761905 30.9190
2019-03-10 20:46 Ewallet 116.06 4.761905 5.8030
2019-01-26 15:17 Cash 609.56 4.761905 30.4780
Rating
Date
2019-01-05 9.1
2019-03-08 9.6
2019-03-03 7.4
2019-01-27 8.4
2019-02-08 5.3
... ...
2019-02-22 4.1
2019-02-18 6.6
2019-02-18 6.6
2019-03-10 8.8
2019-01-26 5.5
[1003 rows x 16 columns]>
df.groupby(df.index).mean()
| Unit price | Quantity | Tax 5% | Total | cogs | gross margin percentage | gross income | Rating | |
|---|---|---|---|---|---|---|---|---|
| Date | ||||||||
| 2019-01-01 | 54.995833 | 6.454545 | 18.830083 | 395.431750 | 376.601667 | 4.761905 | 18.830083 | 6.583333 |
| 2019-01-02 | 44.635000 | 6.000000 | 11.580375 | 243.187875 | 231.607500 | 4.761905 | 11.580375 | 6.050000 |
| 2019-01-03 | 59.457500 | 4.625000 | 12.369813 | 259.766062 | 247.396250 | 4.761905 | 12.369813 | 8.112500 |
| 2019-01-04 | 51.743333 | 5.333333 | 12.886417 | 270.614750 | 257.728333 | 4.761905 | 12.886417 | 6.516667 |
| 2019-01-05 | 61.636667 | 4.583333 | 14.034458 | 294.723625 | 280.689167 | 4.761905 | 14.034458 | 7.433333 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2019-03-26 | 42.972308 | 4.000000 | 7.188692 | 150.962538 | 143.773846 | 4.761905 | 7.188692 | 6.623077 |
| 2019-03-27 | 56.841000 | 4.500000 | 13.822950 | 290.281950 | 276.459000 | 4.761905 | 13.822950 | 6.760000 |
| 2019-03-28 | 45.525000 | 4.800000 | 10.616200 | 222.940200 | 212.324000 | 4.761905 | 10.616200 | 7.050000 |
| 2019-03-29 | 66.346250 | 6.750000 | 23.947875 | 502.905375 | 478.957500 | 4.761905 | 23.947875 | 6.925000 |
| 2019-03-30 | 67.408182 | 5.888889 | 19.424500 | 407.914500 | 388.490000 | 4.761905 | 19.424500 | 6.800000 |
89 rows × 8 columns
df.groupby(df.index).mean().index
DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
'2019-01-05', '2019-01-06', '2019-01-07', '2019-01-08',
'2019-01-09', '2019-01-10', '2019-01-11', '2019-01-12',
'2019-01-13', '2019-01-14', '2019-01-15', '2019-01-16',
'2019-01-17', '2019-01-18', '2019-01-19', '2019-01-20',
'2019-01-21', '2019-01-22', '2019-01-23', '2019-01-24',
'2019-01-25', '2019-01-26', '2019-01-27', '2019-01-28',
'2019-01-29', '2019-01-30', '2019-01-31', '2019-02-01',
'2019-02-02', '2019-02-03', '2019-02-04', '2019-02-05',
'2019-02-06', '2019-02-07', '2019-02-08', '2019-02-09',
'2019-02-10', '2019-02-11', '2019-02-12', '2019-02-13',
'2019-02-14', '2019-02-15', '2019-02-16', '2019-02-17',
'2019-02-18', '2019-02-19', '2019-02-20', '2019-02-21',
'2019-02-22', '2019-02-23', '2019-02-24', '2019-02-25',
'2019-02-26', '2019-02-27', '2019-02-28', '2019-03-01',
'2019-03-02', '2019-03-03', '2019-03-04', '2019-03-05',
'2019-03-06', '2019-03-07', '2019-03-08', '2019-03-09',
'2019-03-10', '2019-03-11', '2019-03-12', '2019-03-13',
'2019-03-14', '2019-03-15', '2019-03-16', '2019-03-17',
'2019-03-18', '2019-03-19', '2019-03-20', '2019-03-21',
'2019-03-22', '2019-03-23', '2019-03-24', '2019-03-25',
'2019-03-26', '2019-03-27', '2019-03-28', '2019-03-29',
'2019-03-30'],
dtype='datetime64[ns]', name='Date', freq=None)
sns.lineplot(x=df.groupby(df.index).mean().index,
y=df.groupby(df.index).mean()['gross income'])
<AxesSubplot:xlabel='Date', ylabel='gross income'>
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x7f4c15181c50>
df.duplicated().sum()
0
df[df.duplicated()==True]
| Invoice ID | Branch | City | Customer type | Gender | Product line | Unit price | Quantity | Tax 5% | Total | Time | Payment | cogs | gross margin percentage | gross income | Rating | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Date | ||||||||||||||||
| 2019-02-18 | 849-09-3807 | A | Yangon | Member | Female | Fashion accessories | 88.34 | 7.0 | 30.919 | 649.299 | 13:28 | Cash | 618.38 | 4.761905 | 30.919 | 6.6 |
| 2019-03-10 | 745-74-0715 | A | Yangon | Normal | Male | Electronic accessories | NaN | 2.0 | 5.803 | 121.863 | 20:46 | Ewallet | 116.06 | 4.761905 | 5.803 | 8.8 |
| 2019-01-26 | 452-04-8808 | B | Mandalay | Normal | Male | Electronic accessories | 87.08 | NaN | 30.478 | 640.038 | 15:17 | Cash | 609.56 | 4.761905 | 30.478 | 5.5 |
df.drop_duplicates(inplace=True)
df.isna().sum()/len(df)
Invoice ID 0.000 Branch 0.000 City 0.000 Customer type 0.079 Gender 0.000 Product line 0.043 Unit price 0.006 Quantity 0.019 Tax 5% 0.000 Total 0.000 Time 0.000 Payment 0.000 cogs 0.000 gross margin percentage 0.000 gross income 0.000 Rating 0.000 dtype: float64
sns.heatmap(df.isnull(),cbar=False)
<AxesSubplot:ylabel='Date'>
df.fillna(df.mean(),inplace=True)
df.fillna(df.mode().iloc[0],inplace=True)
dataset = pd.read_csv('supermarket_sales.csv')
prof = ProfileReport(dataset)
prof
round(np.corrcoef(df['gross income'],df['Rating'])[1][0],2)
-0.04
np.round(df.corr(),2)
| Unit price | Quantity | Tax 5% | Total | cogs | gross margin percentage | gross income | Rating | |
|---|---|---|---|---|---|---|---|---|
| Unit price | 1.00 | 0.01 | 0.63 | 0.63 | 0.63 | -0.0 | 0.63 | -0.01 |
| Quantity | 0.01 | 1.00 | 0.70 | 0.70 | 0.70 | -0.0 | 0.70 | -0.02 |
| Tax 5% | 0.63 | 0.70 | 1.00 | 1.00 | 1.00 | 0.0 | 1.00 | -0.04 |
| Total | 0.63 | 0.70 | 1.00 | 1.00 | 1.00 | 0.0 | 1.00 | -0.04 |
| cogs | 0.63 | 0.70 | 1.00 | 1.00 | 1.00 | 0.0 | 1.00 | -0.04 |
| gross margin percentage | -0.00 | -0.00 | 0.00 | 0.00 | 0.00 | 1.0 | 0.00 | 0.00 |
| gross income | 0.63 | 0.70 | 1.00 | 1.00 | 1.00 | 0.0 | 1.00 | -0.04 |
| Rating | -0.01 | -0.02 | -0.04 | -0.04 | -0.04 | 0.0 | -0.04 | 1.00 |
sns.heatmap(np.round(df.corr(),2),annot=True)
<AxesSubplot:>